001    /*
002     * RandomFactorSequenceGenerator.java
003     *
004     * Copyright 2003 Sergio Anibal de Carvalho Junior
005     *
006     * This file is part of NeoBio.
007     *
008     * NeoBio is free software; you can redistribute it and/or modify it under the terms of
009     * the GNU General Public License as published by the Free Software Foundation; either
010     * version 2 of the License, or (at your option) any later version.
011     *
012     * NeoBio is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
013     * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
014     * PURPOSE. See the GNU General Public License for more details.
015     *
016     * You should have received a copy of the GNU General Public License along with NeoBio;
017     * if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
018     * Boston, MA 02111-1307, USA.
019     *
020     * Proper attribution of the author as the source of the software would be appreciated.
021     *
022     * Sergio Anibal de Carvalho Junior             mailto:sergioanibaljr@users.sourceforge.net
023     * Department of Computer Science               http://www.dcs.kcl.ac.uk
024     * King's College London, UK                    http://www.kcl.ac.uk
025     *
026     * Please visit http://neobio.sourceforge.net
027     *
028     * This project was supervised by Professor Maxime Crochemore.
029     *
030     */
031    
032    package neobio.textui;
033    
034    import java.io.BufferedWriter;
035    import java.io.Writer;
036    import java.io.FileWriter;
037    import java.io.OutputStreamWriter;
038    import java.io.IOException;
039    
040    /**
041     * This class is a simple command line based utility for generating random sequences with
042     * optimal LZ78 factorisation.
043     *
044     * <P>The main method takes three parameters from the command line to generate a
045     * sequence: <CODE>type</CODE>, <CODE>size</CODE> and <CODE>file</CODE>, where:
046     * <UL>
047     * <LI><B><CODE>type</CODE></B> is either <CODE>DNA</CODE> for DNA sequences or
048     * <CODE>PROT</CODE> for protein sequences.
 * <LI><B><CODE>size</CODE></B> is the number of characters.
 * <LI><B><CODE>file</CODE></B> (optional) is the name of a file (if omitted, the
 * sequence is written to standard output).
052     * </UL>
053     * </P>
054     *
055     * @author Sergio A. de Carvalho Jr.
056     */
public class RandomFactorSequenceGenerator
{
    /**
     * Character set for DNA sequences.
     */
    private static final char[] DNA_CHARS = {'A', 'C', 'G', 'T'};

    /**
     * Character set for protein sequences.
     */
    private static final char[] PROT_CHARS = {'A','R','N','D','C','Q','E','G','H','I',
                            'L','K','M','F','P','S','T','W','Y','V','B','Z','X'};

    /**
     * Parses the command line (<CODE>type</CODE>, <CODE>size</CODE> and an optional
     * <CODE>file</CODE>), writes the generated sequence to the file or to standard
     * output, and reports the character distribution on standard error.
     *
     * <P>The method always terminates the JVM: exit status 0 on success, 1 when the
     * arguments are missing or invalid, and 2 when the output cannot be opened or
     * written.</P>
     *
     * @param args command line arguments
     */
    public static void main (String[] args)
    {
        Writer  output;
        String  seq_type, filename;
        int     size;
        char[]  charset;
        int[]   qty;

        // <type> and <size> are both required
        if (args.length < 2)
        {
            usage();
            System.exit(1);
            return;
        }

        // 1st argument: sequence type
        seq_type = args[0];

        // 2nd argument: number of characters
        try
        {
            size = Integer.parseInt(args[1]);
        }
        catch (NumberFormatException e)
        {
            usage();
            System.exit(1);
            return;
        }

        // validate character set
        if (seq_type.equalsIgnoreCase("DNA"))
        {
            charset = DNA_CHARS;
        }
        else if (seq_type.equalsIgnoreCase("PROT"))
        {
            charset = PROT_CHARS;
        }
        else
        {
            // no such option
            usage();
            System.exit(1);
            return;
        }

        // validate size
        if (size <= 3)
        {
            System.err.println ("Error: size must be greater than 3.");
            System.exit(1);
            return;
        }

        // 3rd argument (optional): file name; without it the sequence
        // goes to standard output
        filename = (args.length > 2) ? args[2] : null;

        if (filename != null)
        {
            try
            {
                // open file for writing
                output = new BufferedWriter (new FileWriter (filename));
            }
            catch (IOException e)
            {
                System.err.println ("Error: couldn't open " + filename + " for writing.");
                e.printStackTrace();
                System.exit(2);
                return;
            }
        }
        else
        {
            output = new OutputStreamWriter (System.out);
        }

        try
        {
            qty = writeSequence (output, charset, size);

            output.flush();

            // standard output is only flushed, never closed
            if (filename != null) output.close();
        }
        catch (IOException e)
        {
            System.err.println ("Error: failed to write sequence.");
            e.printStackTrace();
            System.exit(2);
            return;
        }

        printDistribution (charset, qty);

        System.exit(0);
    }

    /**
     * Writes <CODE>size</CODE> characters to <CODE>output</CODE>. The sequence is
     * produced in rounds: each round first re-emits the factor built so far and then
     * extends it with one randomly chosen character, so round <I>k</I> contributes up
     * to <I>k</I> characters. The last round is cut short once <CODE>size</CODE>
     * characters have been written.
     *
     * @param output destination of the sequence (neither flushed nor closed here)
     * @param charset characters to draw from
     * @param size total number of characters to write; must be greater than 3
     * @return how many times each character of <CODE>charset</CODE> was written,
     * indexed like <CODE>charset</CODE>
     * @throws IOException if writing to <CODE>output</CODE> fails
     */
    private static int[] writeSequence (Writer output, char[] charset, int size)
        throws IOException
    {
        int[]   qty = new int [charset.length];
        int     written = 0, f_size = 0, random;

        // Indexes (into charset) of the growing factor. The factor gains one entry
        // per round and round k emits k characters, so for size > 3 it never needs
        // more than size / 2 entries (in fact, far fewer).
        int[]   factor = new int [size / 2];

        while (written < size)
        {
            // copy previous factor
            for (int i = 0; i < f_size && written < size; i++)
            {
                output.write (charset[factor[i]]);
                qty[factor[i]]++;
                written++;
            }

            if (written < size)
            {
                // extend the factor with a randomly chosen character index
                random = (int) (Math.random() * charset.length);
                factor[f_size++] = random;

                output.write (charset[random]);
                qty[random]++;
                written++;
            }
        }

        return qty;
    }

    /**
     * Prints how many times each character was written. The whole report, header
     * included, goes to standard error so that it never mixes with a sequence
     * written to standard output.
     *
     * @param charset characters the sequence was drawn from
     * @param qty number of occurrences of each character, indexed like
     * <CODE>charset</CODE>
     */
    private static void printDistribution (char[] charset, int[] qty)
    {
        System.err.println ("\nCharacter distribution:");

        for (int i = 0; i < charset.length; i++)
            System.err.println (charset[i] + ": " + qty[i]);
    }

    /**
     * Prints command line usage to standard error.
     */
    private static void usage ()
    {
        System.err.println(
        "\nUsage: RandomFactorSequenceGenerator <type> <size> [<file>]\n\n" +
        "where:\n\n" +
        "   <type> = DNA for nucleotide sequences\n" +
        "         or PROT for protein sequences\n\n" +
        "   <size> = number os characters\n\n" +
        "   <file> = name of a file to where the sequence is to be written\n" +
        "            (if ommited, sequence is written to standard output)"
        );
    }
}